Preparation

Loading the packages

library(tidyverse) # Data transformation and visualization
library(scales) # Enables percentage scale for ggplot graphs
library(rpart) # Decision Trees: the CART algorithm
library(C50) # Another implementation of decision trees in R: C5.0
library(party) # Another tree-based algorithm: conditional inference trees
library(rpart.plot) # Nice visualization of rpart Decision Trees
library(partykit) # Visualization of Conditional Inference and C5.0 Trees
library(mlr) # MAchine learning framework for R. Here we are using model accuracy metrics provided by the package.

Data Preparation

d <- readRDS("turnover.RDS")

# Creating training and test samples
set.seed(1234)
train_indices <-  sample(1:nrow(d), size = 2000)
d_train <- d[train_indices, ]
d_test <- d[-train_indices, ]

# What's inside the training sample
glimpse(d_train)
## Observations: 2,000
## Variables: 10
## $ satisfaction_level    <dbl> 9.9, 7.2, 7.0, 4.2, 5.9, 8.2, 7.2, 4.4, ...
## $ last_evaluation       <dbl> 6.8, 5.3, 5.3, 4.9, 7.9, 9.7, 8.8, 5.4, ...
## $ number_project        <dbl> 5, 3, 2, 2, 3, 5, 3, 2, 5, 5, 2, 5, 6, 3...
## $ average_monthly_hours <dbl> 264, 179, 274, 152, 217, 263, 224, 139, ...
## $ time_in_company       <dbl> 10, 3, 4, 3, 4, 5, 3, 3, 3, 4, 3, 2, 4, ...
## $ work_accident         <fct> yes, no, no, no, no, no, no, no, no, no,...
## $ left                  <fct> stayed, stayed, left, left, stayed, left...
## $ promotion_last_5years <fct> no, no, no, no, no, no, no, no, no, no, ...
## $ department            <fct> support, support, sales, support, produc...
## $ salary                <fct> medium, low, low, low, medium, medium, l...

Modeling using CART algorithm

A tree for two quantitative predictors

cmap <- c("stayed" = "green", "left" = "red")

ggplot(d, aes(x = satisfaction_level, y = last_evaluation)) +
  geom_jitter(aes(colour = left), alpha = 0.5) +
  labs(title = "Employee's decision to quit, job satisfaction and performance", colour = "Decision") +
  scale_x_continuous(breaks = 1:10) +
  scale_colour_manual(values = cmap)

m_rpart2 <- rpart(left ~ satisfaction_level + last_evaluation,
                 data = d_train)
rpart.plot(m_rpart2, tweak = 1.5,
           main = "Decision tree for satisfaction and evaluation")

print(m_rpart2)
## n= 2000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2000 470 stayed (0.766 0.234)  
##    2) satisfaction_level>=4.6 1454 140 stayed (0.904 0.096) *
##    3) satisfaction_level< 4.6 546 220 left (0.399 0.601)  
##      6) satisfaction_level>=1.1 422 200 stayed (0.517 0.483)  
##       12) satisfaction_level< 3.5 170  13 stayed (0.924 0.076) *
##       13) satisfaction_level>=3.5 252  61 left (0.242 0.758)  
##         26) last_evaluation>=5.8 36   3 stayed (0.917 0.083) *
##         27) last_evaluation< 5.8 216  28 left (0.130 0.870)  
##           54) last_evaluation< 4.4 10   0 stayed (1.000 0.000) *
##           55) last_evaluation>=4.4 206  18 left (0.087 0.913) *
##      7) satisfaction_level< 1.1 124   0 left (0.000 1.000) *

Plotting the decision boundaryx

Since we have just two quantitative predictors, we can visualize the decision boundary.

# Creating a data grid from 0 to 10 with 0.1 step size 
gr <- expand.grid(satisfaction_level = seq(0, 10, by = 0.1),
             last_evaluation = seq(3, 10, by = 0.1))
# Adding the predicted class labels and turnover probability
boundary_rpart2 <- gr %>%
  mutate(prob = predict(m_rpart2, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2, newdata = gr, type = "class"))
ggplot(boundary_rpart2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

ggplot(boundary_rpart2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = class)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_fill_grey(start = 0.8, end = 0.2) +
  scale_colour_manual(values = cmap)

Comparing the split evaluation criteria

m_rpart2_gini <- rpart(left ~ satisfaction_level + last_evaluation , 
                         data = d_train, 
                         parms = list(split = "gini"))
rpart.plot(m_rpart2_gini, tweak = 1.25,
           main = "Decision tree based on Gini index")

m_rpart2_information <- rpart(left ~ satisfaction_level + last_evaluation, 
                         data = d_train, 
                         parms = list(split = "information"))
rpart.plot(m_rpart2_information, tweak = 1.25,
           main = "Decision tree based on Information Entropy")

Model accuracy

The Confusion Matrix cross-tabulates the predicted and the actual class labels.

To create the confusion matrix, we add predicted class labels and use the table() function to cross-tabulate them against the actual class labels.

# Predicted class labels
pr_rpart2_train <- predict(m_rpart2, newdata = d_train, type = "class")

# Confusion matrix
table_rpart2_train <- table(predicted = pr_rpart2_train, actual = d_train$left)
table_rpart2_train
##          actual
## predicted stayed left
##    stayed   1514  156
##    left       18  312

Let’s calculate some model performance metrics on the training data:

options(digits = 3)
# Accuracy
100 * (table_rpart2_train["stayed", "stayed"] + 
         table_rpart2_train["left", "left"]) / sum(table_rpart2_train)
## [1] 91.3
# Mean Misclassification Error
100 * (table_rpart2_train["stayed", "left"] + 
         table_rpart2_train["left", "stayed"]) / sum(table_rpart2_train)
## [1] 8.7

The above metrics can be readily calculated with the model metrics provided by the mlr package:

# Accuracy
measureACC(pr_rpart2_train, d_train$left) * 100
## [1] 91.3
# Mean misclassification error
measureMMCE(pr_rpart2_train, d_train$left) * 100
## [1] 8.7

Let’s compare the accuracy on the test sample

pr_rpart2_test <- predict(m_rpart2, newdata = d_test, type = "class")
measureACC(pr_rpart2_test, d_test$left) * 100
## [1] 90.7
table(predicted = pr_rpart2_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    766   90
##    left        3  141

The accuracy is very similar to the accuracy on the training sample. This is good, our model didn’t overfit the training data.

Setting the algorithm’s parameters

Decision trees are prone to overfitting the training data. It is technically possible to grow a very large tree, that classifies the training data perfectly. However the rules added to the tree will capture not only the true relationships in the data, but also the noise. Such a tree would be unnecessarily complex, and won’t achieve good accuracy on the new data.

The implementation of the tree growing algorithm of R has reasonable defaults that prevent overfitting. To illustrate the point of overfitting, we’ll change the parameters of the algorithm, to relax the early stopping and the tree pruning rules.

m_rpart2_overfit <- rpart(left ~ satisfaction_level + last_evaluation, 
                          data = d_train,
                          control = 
                            rpart.control(
                              minsplit = 2, # Minimum node size for splitting
                              cp = 0.00001, # minimum relative improvement for splitting
                              maxdepth = 20 # maximum depth of a leaf node
                            ))

rpart.plot(m_rpart2_overfit)

The decision boundary for an overfit tree

# Adding predicted class labels and turnover probabilities
boundary_rpart2_overfit <- gr %>%
  mutate(prob = predict(m_rpart2_overfit, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2_overfit, newdata = gr, type = "class"))

# The decision boundary
ggplot(boundary_rpart2_overfit, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

Comparing the performance of the overfit model on training and test samples

pr_rpart2_overfit_train <- predict(m_rpart2_overfit, newdata = d_train, type = "class")
pr_rpart2_overfit_test <- predict(m_rpart2_overfit, newdata = d_test, type = "class")

The confusion matrix and accuracy on the training data

table(predicted = pr_rpart2_overfit_train, actual = d_train$left)
##          actual
## predicted stayed left
##    stayed   1511   32
##    left       21  436
measureACC(pr_rpart2_overfit_train, d_train$left) * 100
## [1] 97.4

The confusion matrix and accuracy on the test data

table(predicted = pr_rpart2_overfit_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    733   66
##    left       36  165
measureACC(pr_rpart2_overfit_test, d_test$left) * 100
## [1] 89.8

Tree pruning

Plotting model error vs complexity parameter (cp)

plotcp(m_rpart2_overfit)

Tree pruning

m_rpart2_pruned <- prune(m_rpart2_overfit, cp = 0.005)
rpart.plot(m_rpart2_pruned, cex = 0.9)

The accuracy after pruning

pr_rpart2_pruned_train <- predict(m_rpart2_pruned, newdata = d_train, type = "class")
pr_rpart2_pruned_test <- predict(m_rpart2_pruned, newdata = d_test, type = "class")

Confusion matrix and accuracy for the pruned tree - training sample

table(predicted = pr_rpart2_pruned_train, actual = d_train$left)
##          actual
## predicted stayed left
##    stayed   1511  138
##    left       21  330
measureACC(pr_rpart2_pruned_train, d_train$left) * 100
## [1] 92

Confusion matrix and accuracy for the pruned tree - test sample

table(predicted = pr_rpart2_pruned_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    764   82
##    left        5  149
measureACC(pr_rpart2_pruned_test, d_test$left) * 100
## [1] 91.3

After pruning, the drop in accuracy on the test sample is much smaller.

The decision boundary for the pruned tree

# Adding the predicted class labels and turnover probabilities
boundary_rpart2_pruned <- gr %>%
  mutate(prob = predict(m_rpart2_pruned, newdata = gr, type = "prob")[,'left'],
         class = predict(m_rpart2_pruned, newdata = gr, type = "class"))

# Plotting the decision boundary
ggplot(boundary_rpart2_pruned, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

Growing the conditional inference tree (party)

Another decision tree algorithm available in R is Conditional Inference Tree. This algorithm is implemented by the party package.

m_ctree2 <- ctree(left ~ satisfaction_level + last_evaluation, data = d_train)
plot(m_ctree2)

The decision boundary for ctree with 2 predictors

# Adding predicted class labels and turnover probability
boundary_ctree2 <- gr %>%
  mutate(prob = predict(m_ctree2, newdata = gr, type = "prob")[,'left'],
         class = predict(m_ctree2, newdata = gr, type = "response"))

# Plotting the decision boundary
ggplot(boundary_ctree2, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

The accuracy for ctree

pr_ctree2_test <- predict(m_ctree2, newdata = d_test, type = "response")

The confusion matrix and accuracy for ctree - test sample

table(predicted = pr_ctree2_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    665   85
##    left      104  146
measureACC(pr_ctree2_test, d_test$left) * 100
## [1] 81.1

Modeling using C5.0

The C5.0 algorithm by Ross Quinlan is a very popular decision tree algorithm, that is implemented in many commercial data mining packages. Recently the source code for this algorithm has been published by the author under an open license, so the algorithm is now available in R.

m_c50 <- C5.0(left ~ satisfaction_level + last_evaluation, data = d_train)
plot(m_c50)

The decision boundary for a C5.0 tree

# Adding predicted class labels and turnover probabilities
boundary_c50 <- gr %>%
  mutate(prob = predict(m_c50, newdata = gr, type = "prob")[,'left'],
         class = predict(m_c50, newdata = gr, type = "class"))

# Plotting the decision boundary
ggplot(boundary_c50, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

The accuracy of C5.0 tree

pr_c50_test <- predict(m_c50, newdata = d_test, type = "class")

The confusion matrix and accuracy for C5.0 tree - test sample

table(predicted = pr_c50_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    763   77
##    left        6  154
measureACC(pr_c50_test, d_test$left) * 100
## [1] 91.7

In this example, the performance is very close to the rpart.

Trying to improve performance using boosting

The accuracy can be improved by combining the predictions from the ensemble of several models. The C5.0 packages implements one such approach, that is called adaptive boosting. This algorithm grows a sequence of trees, that are trained to correct the errors from the previously built trees. After the ensemble is built, the prediction is made by all the trees in it, and the class label is determined by voting.

The trials parameter controls the number of additional trees to include in the ensemble. This is the maximal number of trees, the actual number can be smaller, when additional trees are proved not to improve performance.

m_c50_boost <- C5.0(left ~ satisfaction_level + last_evaluation, 
                    data = d_train, trials = 5)
m_c50_boost
## 
## Call:
## C5.0.formula(formula = left ~ satisfaction_level + last_evaluation, data
##  = d_train, trials = 5)
## 
## Classification Tree
## Number of samples: 2000 
## Number of predictors: 2 
## 
## Number of boosting iterations: 5 
## Average tree size: 7.6 
## 
## Non-standard options: attempt to group attributes

Plotting the decision boundary for the ensemble of C5.0 trees with boosting

# Adding predicted class labels and turnover probability
boundary_c50_boost <- gr %>%
  mutate(prob = predict(m_c50_boost, newdata = gr, type = "prob")[,'left'],
         class = predict(m_c50_boost, newdata = gr, type = "class"))

# The decision boundary
ggplot(boundary_c50_boost, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = prob)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_gradient(low = "grey80", high = "grey20")

The performance of C5.0 boosted tree ensemble

pr_c50_boost_test <- predict(m_c50_boost, newdata = d_test, type = "class")

The confusion matrix and accuracy for the boosted c5.0 ensemble - test data

table(predicted = pr_c50_boost_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    732   48
##    left       37  183
measureACC(pr_c50_boost_test, d_test$left) * 100
## [1] 91.5

The accuracy didn’t improve.

Cost-sensitive classification

In many cases the negative consequences for different kinds of classification error are not equal. For example, when a fraud detection model misses one fraudulent transaction, the loss is significantly higher compared to the ‘false alarm’ error.

Similarly, when the model looks for customers who can be interested in a product or service, the loss of missing a potential customer is higher compared to the cost of making an additional call or sending an email.

To take into account the misclassification cost, some alrgorithms allow to specify different cost for different kinds of the misclassification error.

In our case we will specify 5 times higher cost for ‘missed opportunity’ type error: it’s more important to determine if an employee is considering leaving the company, and to try to prevent the loss of a valuable employee.

cost_matrix <- matrix(c(0, 5, 1, 0 ), ncol = 2, byrow = TRUE,
                      dimnames = list("predicted" = c("stayed", "left"),
                                      "actual" = c("stayed", "left")))
cost_matrix
##          actual
## predicted stayed left
##    stayed      0    5
##    left        1    0

Creating the cost-sensitive classifier

m_c50_cost <- C5.0(left ~ satisfaction_level + last_evaluation, 
                   data = d_train, cost = cost_matrix)
plot(m_c50_cost)

The decision boundary for the cost sensitive C5.0 classifier

# Adding the class labels
boundary_c50_cost <- gr %>%
  mutate(class = predict(m_c50_cost, newdata = gr, type = "class"))

# Plotting the decision boundary
ggplot(boundary_c50_cost, aes(satisfaction_level, last_evaluation)) +
  geom_raster(aes(fill = class)) +
  geom_jitter(aes(x = satisfaction_level, 
                  y = last_evaluation, 
                  colour = left), 
              data = d_train) +
  scale_colour_manual(values = cmap) +
  scale_fill_manual(values = c("grey80","grey20"))

The accuracy of the cost-sensitive C5.0 model

pr_c50_cost_test <- predict(m_c50_cost, newdata = d_test, type = "class")

The confusion matrix and the accuracy for the cost-sensitive c5.0 classifier - test sample:

table(predicted = pr_c50_cost_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    642   14
##    left      127  217
measureACC(pr_c50_cost_test, d_test$left) * 100
## [1] 85.9

An alternative method for visualizing rpart trees

For small trees, the visualization used by the party package for ctree-produced trees is more concise.

The trees created by rpart can be also visualized this way. To do so, use the partykit::as.party() function.

rpart.plot(m_rpart2, main = "Default visualization for rpart tree")

plot(as.party(m_rpart2), main = "rpart tree visualized by party")

Comparing the performance of models using all the predictors

rpart

m_rpart_all <- rpart(left ~ ., data = d)
rpart.plot(m_rpart_all, cex = 1)

Confusion matrix for rpart - test sample

pr_rpart_all_test <- predict(m_rpart_all, newdata = d_test, type = "class")
table(predicted = pr_rpart_all_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    756   15
##    left       13  216

Accuracy for rpart - test sample

measureACC(pr_rpart_all_test, d_test$left) * 100
## [1] 97.2

ctree

m_ctree_all <- ctree(left ~ ., data = d)
plot(m_ctree_all)

Confusion matrix for ctree - test sample

pr_ctree_all_test <- predict(m_ctree_all, newdata = d_test, type = "response")
table(predicted = pr_ctree_all_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    726   19
##    left       43  212

Accuracy for ctree - test sample

measureACC(pr_ctree_all_test, d_test$left) * 100
## [1] 93.8

C5.0

m_c50_all <- C5.0(left ~ ., data = d)
plot(m_c50_all)

Confusion matrix for c5.0 - test sample

pr_c50_all_test <- predict(m_c50_all, newdata = d_test, type = "class")
table(predicted = pr_c50_all_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    762   15
##    left        7  216

Accuracy for C5.0 - test sample

measureACC(pr_c50_all_test, d_test$left) * 100
## [1] 97.8

C5.0 with boosting

m_c50_boost_all <- C5.0(left ~ ., data = d, trials = 100)
m_c50_boost_all
## 
## Call:
## C5.0.formula(formula = left ~ ., data = d, trials = 100)
## 
## Classification Tree
## Number of samples: 3000 
## Number of predictors: 9 
## 
## Number of boosting iterations: 100 
## Average tree size: 23.8 
## 
## Non-standard options: attempt to group attributes

Confusion matrix for boosted c5.0 - test sample

pr_c50_boost_all_test <- predict(m_c50_boost_all, newdata = d_test, type = "class")
table(predicted = pr_c50_boost_all_test, actual = d_test$left)
##          actual
## predicted stayed left
##    stayed    767   10
##    left        2  221

Accuracy for boosted C5.0 - test sample

measureACC(pr_c50_boost_all_test, d_test$left) * 100
## [1] 98.8